In [1]:
# NOTE(review): hardcoded absolute path — adjust to your local checkout before running.
%cd /mnt/Data/Jupyter/VisCA1
/mnt/Data/Jupyter/VisCA1
In [2]:
import pandas as pd
In [3]:
import warnings
# NOTE(review): blanket "ignore" silences every warning, including the pandas
# SettingWithCopyWarning triggered by later slice+inplace cells — consider
# filtering by category instead of hiding everything.
warnings.filterwarnings("ignore")
# enable the %%R cell magic used throughout this notebook
%load_ext rpy2.ipython
In [4]:
%%R
# Attach the plotting/data stack.  library() (unlike require()) fails loudly
# when a package is missing, which is what we want during notebook setup.
library(tidyverse)
library(ggplot2)   # already attached by tidyverse; kept for explicitness
library(plotluck)
library(reshape2)
library(cowplot)
theme_set(theme_minimal())

Exercise 1

Using Entity Type, create a tree map showing the sum of total-lobbying by type of government.

• What did you find out?

• What other questions would you want to answer?

• What calculations could you create that would inform your reporting? Identify up to three calculations (exploring Create Calculations feature).

In [5]:
# Load the lobbying spreadsheet and derive the total spend per entity.
lobbying = pd.read_excel("lobbying_data.xlsx")
lobbying["Total-Lobbying"] = lobbying["Compensation"] + lobbying["Expenses"]

# Chained rename on a fresh frame: the previous slice-then-inplace-rename
# pattern operated on a view of `lobbying` and raised SettingWithCopyWarning.
lobbyingTree = (
    lobbying[["entity_type", "Entity-cleaned", "Total-Lobbying"]]
    .rename(columns={"entity_type": "Type",
                     "Entity-cleaned": "Entity",
                     "Total-Lobbying": "Lobbying"})
)
lobbyingTree.head()
Out[5]:
Type Entity Lobbying
0 CITIES Algona 18000.0
1 CITIES Arlington 13687.5
2 CITIES Auburn 1900.0
3 CITIES Battle Ground 12000.0
4 CITIES Bellevue 19000.0
In [6]:
%%R -i lobbyingTree
require(data.tree)

# Build a two-level hierarchy (Type -> Entity) under a synthetic root;
# as.Node() uses the pathString column to construct the tree.
lobbyingTree$pathString <- paste('LOBBYING_TREE', lobbyingTree$Type, lobbyingTree$Entity, sep = "/")
tree <- as.Node(lobbyingTree[,])
print(tree, pruneMethod='dist', limit=20)
                                                               levelName
1  LOBBYING_TREE                                                        
2   ¦--CITIES                                                           
3   ¦   ¦--Algona                                                       
4   ¦   ¦--Arlington                                                    
5   ¦   °--... 48 nodes w/ 0 sub                                        
6   ¦--COUNTIES                                                         
7   ¦   ¦--Asotin County                                                
8   ¦   ¦--Clark County                                                 
9   ¦   °--... 16 nodes w/ 0 sub                                        
10  ¦--OTHER                                                            
11  ¦   ¦--Association Of Washington Cities                             
12  ¦   ¦--Association Of Washington State Public Facilities Districts  
13  ¦   °--... 18 nodes w/ 0 sub                                        
14  ¦--PORTS                                                            
15  ¦   ¦--Port Of Bellingham                                           
16  ¦   ¦--Port Of Bremerton                                            
17  ¦   °--... 9 nodes w/ 0 sub                                         
18  ¦--PUBLIC FACILITIES DISTRICTS                                      
19  ¦   ¦--Spokane Public Facilities District                           
20  ¦   °--Washington State Convention Center Public Facilities District
21  ¦--SCHOOL DISTRICTS                                                 
22  ¦   ¦--Seattle Public Schools                                       
23  ¦   °--... 3 nodes w/ 0 sub                                         
24  ¦--TRIBES                                                           
25  ¦   °--... 17 nodes w/ 0 sub                                        
26  °--UTILITY DISTRICTS                                                
27      °--... 9 nodes w/ 0 sub                                         
In [7]:
%%R
tree$Do(function(x) x$aggLobbying <- Aggregate(x, 'Lobbying', sum))
tree$Sort(attribute='Lobbying', decreasing=TRUE, recursive=TRUE)
print(tree, 'Lobbying', 'aggLobbying', pruneMethod='dist', limit=14)
                                      levelName  Lobbying aggLobbying
1  LOBBYING_TREE                                       NA  2523089.92
2   ¦--CITIES                                          NA   870602.31
3   ¦   ¦--Seattle                              138092.00   138092.00
4   ¦   ¦--Tacoma                                74100.00    74100.00
5   ¦   °--... 48 nodes w/ 0 sub                       NA          NA
6   ¦--COUNTIES                                        NA   321889.00
7   ¦   ¦--King County                           96654.00    96654.00
8   ¦   ¦--Pierce County                         67802.00    67802.00
9   ¦   °--... 16 nodes w/ 0 sub                       NA          NA
10  ¦--OTHER                                           NA   408145.84
11  ¦   ¦--Washington Indian Gaming Association  79319.35    79319.35
12  ¦   °--... 19 nodes w/ 0 sub                       NA          NA
13  ¦--PORTS                                           NA   218413.61
14  ¦   °--... 11 nodes w/ 0 sub                       NA          NA
15  ¦--PUBLIC FACILITIES DISTRICTS                     NA    24073.45
16  ¦   °--... 2 nodes w/ 0 sub                        NA          NA
17  ¦--SCHOOL DISTRICTS                                NA    60500.00
18  ¦   °--... 4 nodes w/ 0 sub                        NA          NA
19  ¦--TRIBES                                          NA   490778.71
20  ¦   °--... 17 nodes w/ 0 sub                       NA          NA
21  °--UTILITY DISTRICTS                               NA   128687.00
22      °--... 9 nodes w/ 0 sub                        NA          NA
In [8]:
%%R -w 8 -h 5 --units in -r 150
require(treemap)

# Treemap of per-entity lobbying nested under entity type.
# NOTE(review): rows 9:139 of the climb-direction edge list presumably skip
# the 8 root->type edges so only entity rectangles are sized — confirm these
# magic indices survive any data change.
# NOTE(review): heat.colors() expects a non-negative count; heat.colors(-55)
# looks wrong — verify the intended palette size.
treemap(ToDataFrameNetwork(tree, 'Lobbying', direction='climb')[9:139,], 
        index=c('from', 'to'), vSize='Lobbying', vColor='Lobbying', type='value', palette=heat.colors(-55))
In [9]:
%%R -w 12 -h 12 --units in -r 100
require(igraph)

treeGraph <- as.igraph(tree, 'aggLobbying', directed=F, direction='climb')
vertex_attr(treeGraph, 'aggLobbying')[1] <- 0

plot(treeGraph, vertex.color='lightblue', vertex.label.family='Segoe UI', 
     vertex.label.cex=0.55, vertex.label.color='black', layout=layout.kamada.kawai,
     vertex.size=vertex_attr(treeGraph, 'aggLobbying') / (max(vertex_attr(treeGraph, 'aggLobbying')) * 0.035))
In [10]:
# WA cities only, one row per city, indexed by city for the later join.
longlat = (
    pd.read_csv("zip_codes_states.csv")
    .query("state == 'WA'")
    .drop_duplicates(subset="city")
    .set_index("city")
)
longlat.head()
Out[10]:
zip_code latitude longitude state county
city
Auburn 98001 47.465495 -121.821908 WA King
Federal Way 98003 47.432251 -121.803388 WA King
Bellevue 98004 47.615471 -122.207221 WA King
Black Diamond 98010 47.310568 -121.998721 WA King
Bothell 98011 47.750689 -122.214376 WA King
In [11]:
# Attach coordinates to the city-level lobbying rows.
lobbyingGeo = (
    lobbying[lobbying["entity_type"] == "CITIES"]
    .join(longlat, how="left", on="City")
    [["Total-Lobbying", "City", "county", "longitude", "latitude"]]
    .rename(columns={"Total-Lobbying": "lobbying", "City": "city"})
)
lobbyingGeo.drop(lobbyingGeo.index[44], inplace=True)  # duplicate row
# rows the zip-code table couldn't resolve; positional `any(1)` is deprecated
# in modern pandas, so pass the axis by keyword
lobbyingGeo[lobbyingGeo.isnull().any(axis=1)]
Out[11]:
lobbying city county longitude latitude
0 18000.00 Algona NaN NaN NaN
6 18000.00 Burien NaN NaN NaN
8 7166.68 Covington NaN NaN NaN
12 12061.47 Fife NaN NaN NaN
38 7000.00 Seatac NaN NaN NaN
41 25900.00 Shoreline NaN NaN NaN
In [12]:
# Manual fixes for the six cities missing from the zip-code table.
# Coordinates retrieved from http://citylatitudelongitude.com/
manual_coords = {
    0: ("King", 47.281954, -122.250388),    # Algona
    6: ("King", 47.46917, -122.364291),     # Burien
    8: ("King", 47.364791, -122.104563),    # Covington
    12: ("Pierce", 47.23206, -122.351726),  # Fife
    38: ("King", 47.44333, -122.298767),    # Seatac
    41: ("King", 47.756904, -122.342414),   # Shoreline
}
for idx, (county, lat, lon) in manual_coords.items():
    lobbyingGeo.loc[idx, ["county", "latitude", "longitude"]] = [county, lat, lon]

lobbyingGeo.head()
Out[12]:
lobbying city county longitude latitude
0 18000.0 Algona King -122.250388 47.281954
1 13687.5 Arlington Snohomish -121.959469 48.181498
2 1900.0 Auburn King -121.821908 47.465495
3 12000.0 Battle Ground Clark -122.531645 45.803592
4 19000.0 Bellevue King -122.207221 47.615471
In [13]:
# A duplicated coordinate pair means two cities would plot on the same point.
duplicates = lobbyingGeo.duplicated(subset=["longitude", "latitude"]).any()
print("Any Duplicate coordinates present?:", duplicates)
Any Duplicate coordinates present?: True
In [14]:
# Show every row that shares its coordinates with another row.  The original
# float(...) cast on a Series breaks on modern pandas and silently assumed
# exactly one duplicated pair; duplicated(keep=False) surfaces all members of
# every duplicate group directly.
lobbyingGeo[lobbyingGeo.duplicated(subset=["longitude", "latitude"], keep=False)]
Out[14]:
lobbying city county longitude latitude
11 22490.0 Federal Way King -121.803388 47.432251
39 138092.0 Seattle King -121.803388 47.432251
In [15]:
# Give Federal Way (row 11) its own coordinates; Seattle (row 39) keeps the
# zip-table value for now.  The original line assigned the new longitude to
# row 41 (Shoreline) by mistake, clobbering Shoreline's manual fix from the
# cell above while leaving Federal Way's longitude unchanged.
lobbyingGeo.loc[11, ["latitude", "longitude"]] = [47.322323, -122.312622]

# TODO: sort out seattle location (row 39 still carries Federal Way's old coordinates)
In [16]:
# Re-run the duplicate-coordinate check after the manual fix above.
print("Any Duplicate coordinates present?:",
      lobbyingGeo.duplicated(subset=["longitude", "latitude"]).any())
Any Duplicate coordinates present?: False
In [17]:
%%R -i lobbyingGeo -w 6 -h 4 --units in -r 200
require(ggmap)

# bounding box of the observed city coordinates
lat <- c(min(lobbyingGeo$latitude), max(lobbyingGeo$latitude))
lon <- c(min(lobbyingGeo$longitude), max(lobbyingGeo$longitude))

# basemap centred on the mean coordinate
# NOTE(review): recent ggmap releases require a registered Google API key for
# source='google' — confirm the tile download still works.
plot <- get_map(location=c(mean(lon), mean(lat)), zoom=6, source='google', maptype='terrain')
# 2D kernel density of city locations overlaid on the basemap
ggmap(plot, extent='device') + 
  geom_density2d(data=lobbyingGeo, aes(x=longitude, y=latitude), size=0.1, color='black') + 
  stat_density2d(data=lobbyingGeo, 
                 aes(x=longitude, y=latitude, fill=..level.., alpha=..level..), size=0.01, 
                 bins=16, geom='polygon') + 
  scale_fill_gradient(low='green', high='red', guide=FALSE) + 
  scale_alpha(range=c(0, 0.3), guide=FALSE) +
  scale_x_continuous(limits=c(min(lon), max(lon)), expand=c(0.4, 0)) +
  scale_y_continuous(limits=c(min(lat), max(lat)), expand=c(0.4, 0))

# talk about using average measures to set map location
In [18]:
%%R -i lobbyingGeo -w 5 -h 5 --units in -r 200
# roadmap centred on the median city location (median is robust to the
# outlying cities at the edges of the state)
plot <- get_map(location=c(median(lobbyingGeo$longitude), median(lobbyingGeo$latitude)), 
                zoom=9, source='google', maptype='roadmap')
# one point per city, sized by its share of the maximum lobbying spend
ggmap(plot, extent='device') + 
  geom_point(aes(x=longitude, y=latitude), data=lobbyingGeo, col="darkred", alpha=0.4, 
              size=lobbyingGeo$lobbying/max(lobbyingGeo$lobbying)*15) +
  scale_alpha(range = c(0, 0.3), guide = FALSE)
In [19]:
# import matplotlib.pyplot as plt
# plt.rcParams['figure.figsize'] = (20,10)

# refinery.plot()
In [20]:
%%R -o all_states
# require(maps)

# US state polygon outlines, exported back to Python as `all_states`.
# NOTE(review): map_data() is a ggplot2 wrapper that needs the 'maps' package
# installed — presumably that is why the require() above could be commented
# out; confirm 'maps' is present in the environment.
all_states <- map_data("state")
In [21]:
# Load the accident spreadsheet, dropping the trailing totals row.
refinery = pd.read_excel("19. Data Set - Module 6 - accidents-state.xlsx").iloc[:-1]
refinery.head()
Out[21]:
State # of RMP facilities # of accidents # of deaths # of injuries # evacuated Property damage (dollars)
0 Alabama 196 31 0 23 59 182682
1 Alaska 43 7 0 1 20 103
2 Arizona 110 7 0 3 35 200000
3 Arkansas 165 47 0 63 1327 52232813
4 California 867 76 0 14076 65426 8531448
In [22]:
# lower-case the state names so they join against map_data's lower-case regions
refinery = refinery.assign(State=refinery["State"].str.lower())
In [23]:
# Right-join keeps every state in `refinery`, so territories with no polygon
# rows in all_states surface with NaN map columns.
nulls = all_states.join(refinery.set_index("State"), how="right", on="region")
# positional `any(1)` is deprecated — pass axis by keyword; `== True` was redundant
nulls[nulls.isnull().any(axis=1)]

# note on how guam, hawaii and alaska are low so will be ignored.  
# puerto rico and the virgin islands very high so will be ignored and explored for further interest.
# all will be left in the frame in order to facilitate any potential calculations.
Out[23]:
long lat group order region subregion # of RMP facilities # of accidents # of deaths # of injuries # evacuated Property damage (dollars)
15599 NaN NaN NaN NaN alaska NaN 43 7 0 1 20 103
15599 NaN NaN NaN NaN guam NaN 4 0 0 0 0 0
15599 NaN NaN NaN NaN hawaii NaN 16 4 0 1 600 500
15599 NaN NaN NaN NaN puerto rico NaN 66 3 0 71 2000 500000
15599 NaN NaN NaN NaN virgin islands of the u.s. NaN 1 4 0 16 1300 8540000
In [24]:
# Map the spreadsheet's verbose headers onto snake_case analysis names.
renames = {
    "# of RMP facilities": "facility_count",
    "# of accidents": "accidents",
    "# of deaths": "deaths",
    "# of injuries": "injuries",
    "# evacuated": "evacuated",
    "Property damage (dollars)": "property_damage",
}
refinery = refinery.rename(columns=renames)

# polygon rows joined to the accident stats; subregion is unused for states
refineryGeo = (
    all_states
    .join(refinery.set_index("State"), how="right", on="region")
    .drop(columns="subregion")
)
refineryGeo.head()
Out[24]:
long lat group order region facility_count accidents deaths injuries evacuated property_damage
1 -87.462006 30.389681 1.0 1.0 alabama 196 31 0 23 59 182682
2 -87.484932 30.372492 1.0 2.0 alabama 196 31 0 23 59 182682
3 -87.525032 30.372492 1.0 3.0 alabama 196 31 0 23 59 182682
4 -87.530762 30.332386 1.0 4.0 alabama 196 31 0 23 59 182682
5 -87.570869 30.326654 1.0 5.0 alabama 196 31 0 23 59 182682
In [25]:
%%R -i refineryGeo -w 7 -h 3 --units in -r 180

# bounding box of the lower-48 polygons; note the column is `long` —
# the original `all_states$lon` only worked via `$` partial matching
lat <- c(min(all_states$lat), max(all_states$lat))
lon <- c(min(all_states$long), max(all_states$long))

plot <- get_map(location=c(mean(lon), mean(lat)), zoom=4, source='stamen', maptype='toner')

# One choropleth panel per metric.  Factoring the shared ggmap/polygon recipe
# into a helper removes six near-identical copies.  The .data pronoun needs
# ggplot2 >= 3.0, which the tidyverse load provides.
state_panel <- function(fill_var, high_col) {
  ggmap(plot, extent='device') +
    scale_x_continuous(limits=c(min(lon), max(lon)), expand = c(0, 0)) +
    scale_y_continuous(limits=c(min(lat), max(lat)), expand = c(0, 0)) + 
    geom_polygon(data=refineryGeo, aes(x=long, y=lat, group=group, fill=.data[[fill_var]]),
                 colour='black', size=0.1, alpha=0.4) +
    scale_fill_continuous(low='thistle2', high=high_col, guide=FALSE)
}

plot_grid(state_panel('facility_count', 'darkblue'),
          state_panel('property_damage', 'chocolate'),
          state_panel('deaths', 'darkred'),
          state_panel('accidents', 'darkblue'),
          state_panel('evacuated', 'chocolate'),
          state_panel('injuries', 'darkred'),
          nrow=2, ncol=3, 
          labels=c('Facilities', 'Damage', 'Deaths', 'Accidents', 'Evacuated', 'Injuries'))
In [26]:
# Drop the territories/states that have no polygon data (found above).
refinery = refinery.set_index("State").drop(
    index=nulls.loc[nulls.isnull().any(axis=1), "region"].tolist()).reset_index()

# Per-facility rates: vectorized column division replaces five iterrows()
# loops (identical values, aligned on the freshly reset 0..n index).
refinery["scaled_accidents"] = refinery["accidents"] / refinery["facility_count"]
refinery["scaled_damage"] = refinery["property_damage"] / refinery["facility_count"]
refinery["scaled_evacuated"] = refinery["evacuated"] / refinery["facility_count"]
refinery["scaled_deaths"] = refinery["deaths"] / refinery["facility_count"]
refinery["scaled_injuries"] = refinery["injuries"] / refinery["facility_count"]

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# Min-max rescale the five rate columns (iloc 7..11) to [0, 1] so the map
# panels share a colour scale; keep the identifying columns (iloc 0..6) as-is.
refinery_scaled = pd.concat(
    [refinery.iloc[:, [0, 1, 2, 3, 4, 5, 6]],
     pd.DataFrame(scaler.fit_transform(refinery.iloc[:, [7, 8, 9, 10, 11]]),
                  columns=["scaled_accidents", "scaled_damage", "scaled_evacuated",
                           "scaled_deaths", "scaled_injuries"])],
    axis=1)

refineryGeo_scaled = all_states.join(refinery_scaled.set_index("State"), how="right", on="region")
refineryGeo_scaled.drop(columns="subregion", inplace=True)
refineryGeo_scaled.head()
Out[26]:
long lat group order region facility_count accidents deaths injuries evacuated property_damage scaled_accidents scaled_damage scaled_evacuated scaled_deaths scaled_injuries
1 -87.462006 30.389681 1.0 1 alabama 196 31 0 23 59 182682 0.158163 0.000785 0.003989 0.0 0.007228
2 -87.484932 30.372492 1.0 2 alabama 196 31 0 23 59 182682 0.158163 0.000785 0.003989 0.0 0.007228
3 -87.525032 30.372492 1.0 3 alabama 196 31 0 23 59 182682 0.158163 0.000785 0.003989 0.0 0.007228
4 -87.530762 30.332386 1.0 4 alabama 196 31 0 23 59 182682 0.158163 0.000785 0.003989 0.0 0.007228
5 -87.570869 30.326654 1.0 5 alabama 196 31 0 23 59 182682 0.158163 0.000785 0.003989 0.0 0.007228
In [27]:
%%R -i refineryGeo_scaled -w 7 -h 3 --units in -r 180

# bounding box of the lower-48 polygons; the column is `long` — the original
# `all_states$lon` only worked via `$` partial matching
lat <- c(min(all_states$lat), max(all_states$lat))
lon <- c(min(all_states$long), max(all_states$long))

plot <- get_map(location=c(mean(lon), mean(lat)), zoom=4, source='stamen', maptype='toner')

# Shared recipe for the five min-max-scaled panels: a common fill range of
# [0, 1] makes them directly comparable.  The .data pronoun needs
# ggplot2 >= 3.0, which the tidyverse load provides.
scaled_panel <- function(fill_var, high_col) {
  ggmap(plot, extent='device') +
    scale_x_continuous(limits=c(min(lon), max(lon)), expand = c(0, 0)) +
    scale_y_continuous(limits=c(min(lat), max(lat)), expand = c(0, 0)) + 
    geom_polygon(data=refineryGeo_scaled, aes(x=long, y=lat, group=group, fill=.data[[fill_var]]),
                 colour='black', size=0.1, alpha=0.4) +
    lims(fill=c(0, 1)) +
    scale_fill_continuous(low='thistle2', high=high_col) +
    theme(legend.position='none')
}

# facility_count is a raw count, so it keeps its own (hidden) colour scale
facility_count <- ggmap(plot, extent='device') +
  scale_x_continuous(limits=c(min(lon), max(lon)), expand = c(0, 0)) +
  scale_y_continuous(limits=c(min(lat), max(lat)), expand = c(0, 0)) + 
  geom_polygon(data=refineryGeo_scaled, aes(x=long, y=lat, group=group, fill=facility_count), colour='black', size=0.1, alpha=0.4) +
  scale_fill_continuous(low='thistle2', high='darkblue', guide=FALSE)

plot_grid(facility_count,
          scaled_panel('scaled_damage', 'chocolate'),
          scaled_panel('scaled_deaths', 'darkred'),
          scaled_panel('scaled_accidents', 'darkblue'),
          scaled_panel('scaled_evacuated', 'chocolate'),
          scaled_panel('scaled_injuries', 'darkred'),
          nrow=2, ncol=3, 
          labels=c('Facilities', 'Damage', 'Deaths', 'Accidents', 'Evacuated', 'Injuries'))
In [39]:
%%R -i refineryGeo_scaled -w 2 -h 3 --units in -r 180

# Two composite indices, each the mean of two [0,1]-scaled metrics:
# "Collateral Damage" = (scaled_damage + scaled_evacuated) / 2
# "Human Cost"        = (scaled_injuries + scaled_deaths) / 2
# NOTE(review): `all_states$lon` relies on `$` partial matching of the `long`
# column — works, but fragile; confirm intent.
lat <- c(min(all_states$lat), max(all_states$lat))
lon <- c(min(all_states$lon), max(all_states$lon))

plot <- get_map(location=c(mean(lon), mean(lat)), zoom=4, source='stamen', maptype='toner')

collateral <- ggmap(plot, extent='device') +
  scale_x_continuous(limits=c(min(lon), max(lon)), expand = c(0, 0)) +
  scale_y_continuous(limits=c(min(lat), max(lat)), expand = c(0, 0)) + 
  geom_polygon(data=refineryGeo_scaled, aes(x=long, y=lat, group=group, fill=(scaled_damage+scaled_evacuated)/2), colour='black', size=0.1, alpha=0.4) +
  lims(fill=c(0, 1)) +
  scale_fill_continuous(low='thistle2', high='chocolate') +
  theme(plot.title=element_text(size=10), legend.position='none') +
  ggtitle('Collateral Damage')   

human_cost <- ggmap(plot, extent='device') +
  scale_x_continuous(limits=c(min(lon), max(lon)), expand = c(0, 0)) +
  scale_y_continuous(limits=c(min(lat), max(lat)), expand = c(0, 0)) + 
  geom_polygon(data=refineryGeo_scaled, aes(x=long, y=lat, group=group, fill=(scaled_injuries+scaled_deaths)/2), colour='black', size=0.1, alpha=0.4) +
  lims(fill=c(0, 1)) +
  scale_fill_continuous(low='thistle2', high='darkred') +
  theme(plot.title=element_text(size=10), legend.position='none') +
  ggtitle('Human Cost')
 
plot_grid(collateral, human_cost, nrow=2, 
          labels=NULL)
In [37]:
%%R -i refineryGeo_scaled -w 7 -h 3 --units in -r 180

# NOTE(review): abandoned experiment — the choropleth fill and the human-cost
# panel are commented out, leaving only a 2D density overlay of polygon
# vertex positions (which reflects border-point density, not accident data).
# Consider deleting this cell or finishing the idea.
lat <- c(min(all_states$lat), max(all_states$lat))
lon <- c(min(all_states$lon), max(all_states$lon))

plot <- get_map(location=c(mean(lon), mean(lat)), zoom=4, source='stamen', maptype='toner')

collateral <- ggmap(plot, extent='device') +
  scale_x_continuous(limits=c(min(lon), max(lon)), expand = c(0, 0)) +
  scale_y_continuous(limits=c(min(lat), max(lat)), expand = c(0, 0)) + 
#   geom_polygon(data=refineryGeo_scaled, aes(x=long, y=lat, group=group, fill=(scaled_damage+scaled_evacuated)/2), colour='black', size=0.1, alpha=0.4) +
#   lims(fill=c(0, 1)) +
#   scale_fill_continuous(low='thistle2', high='chocolate') +
  theme(plot.title=element_text(size=10), legend.position='none') +
  ggtitle('Collateral Damage') +
  geom_density2d(data=refineryGeo_scaled, aes(x=long, y=lat), size=0.1, color='black') + 
  stat_density2d(data=refineryGeo_scaled, 
                 aes(x=long, y=lat, fill=..level.., alpha=..level..), size=0.01, 
                 bins=16, geom='polygon') + 
  scale_fill_gradient(low='green', high='red', guide=FALSE) + 
  scale_alpha(range=c(0, 0.3), guide=FALSE)

collateral
    
# human_cost <- ggmap(plot, extent='device') +
#   scale_x_continuous(limits=c(min(lon), max(lon)), expand = c(0, 0)) +
#   scale_y_continuous(limits=c(min(lat), max(lat)), expand = c(0, 0)) + 
#   geom_polygon(data=refineryGeo_scaled, aes(x=long, y=lat, group=group, fill=(scaled_injuries+scaled_deaths)/2), colour='black', size=0.1, alpha=0.4) +
#   lims(fill=c(0, 1)) +
#   scale_fill_continuous(low='thistle2', high='darkred') +
#   theme(plot.title=element_text(size=10), legend.position='none') +
#   ggtitle('Human Cost')
 
# plot_grid(collateral, human_cost, nrow=2, 
#           labels=NULL)
In [79]:
import bs4 as bs
import requests

def state_land_area():
    """Scrape US state/territory land areas from Wikipedia.

    Returns
    -------
    pd.DataFrame
        Columns ``name`` and ``area_km``; ``area_km`` keeps the page's string
        formatting (thousands separators), matching the original behavior.
    """
    resp = requests.get("https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_area")
    soup = bs.BeautifulSoup(resp.text, "lxml")
    table = soup.find("table", {"class": "wikitable sortable"})
    names = []
    areas = []
    # Single pass over the body rows (the original iterated the table twice);
    # [2:-4] skips the two header rows and the four footer/summary rows.
    for row in table.findAll("tr")[2:-4]:
        cells = row.findAll("td")
        # [1:] drops a leading character in the name cell — presumably a
        # non-breaking space or flag artifact; TODO confirm against the live page
        names.append(cells[0].text[1:])
        # column 6 holds the land area in km^2
        areas.append(cells[6].text)
    return pd.DataFrame({"name": names, "area_km": areas})

landmass = state_land_area()
landmass

# gets data from here:
# https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_area
Out[79]:
name area_km
0 Alaska 1,477,953
1 Texas 676,587
2 California 403,466
3 Montana 376,962
4 New Mexico 314,161
5 Arizona 294,207
6 Nevada 284,332
7 Colorado 268,431
8 Oregon 248,608
9 Wyoming 251,470
10 Michigan 146,435
11 Minnesota 206,232
12 Utah 212,818
13 Idaho 214,045
14 Kansas 211,754
15 Nebraska 198,974
16 South Dakota 196,350
17 Washington 172,119
18 North Dakota 178,711
19 Oklahoma 177,660
20 Missouri 178,040
21 Florida 138,887
22 Wisconsin 140,268
23 Georgia 148,959
24 Illinois 143,793
25 Iowa 144,669
26 New York 122,057
27 North Carolina 125,920
28 Arkansas 134,771
29 Alabama 131,171
30 Louisiana 111,898
31 Mississippi 121,531
32 Pennsylvania 115,883
33 Ohio 105,829
34 Virginia 102,279
35 Tennessee 106,798
36 Kentucky 102,269
37 Indiana 92,789
38 Maine 79,883
39 South Carolina 77,857
40 West Virginia 62,259
41 Maryland 25,142
42 Hawaii 16,635
43 Massachusetts 20,202
44 Vermont 23,871
45 New Hampshire 23,187
46 New Jersey 19,047
47 Connecticut 12,542
48 Delaware 5,047
49 Rhode Island 2,678
50 District of Columbia 158
51 Puerto Rico 8,868
52 Northern Mariana Islands 472
53 United States Virgin Islands 348
54 American Samoa 198
55 Guam 543
In [26]:
# NOTE(review): near-duplicate of the earlier scaling cell (it even shares the
# In[26] execution count, so the notebook was run out of order).  The
# "land_scaled_*" columns still divide by facility_count, not by land area,
# and `landmass` from the scraping cell is never used — presumably the
# per-km^2 normalisation was left unfinished; confirm the intent.
# Re-running this after the earlier drop() will likely raise KeyError on the
# already-dropped index labels.
refinery = refinery.set_index("State").drop(index=[region for region in nulls[nulls.isnull().any(1) == True].region]).reset_index()
refinery["land_scaled_accidents"] = pd.DataFrame([row["accidents"] / row["facility_count"] for index, row in refinery.iterrows()])
refinery["land_scaled_damage"] = pd.DataFrame([row["property_damage"] / row["facility_count"] for index, row in refinery.iterrows()])
refinery["land_scaled_evacuated"] = pd.DataFrame([row["evacuated"] / row["facility_count"] for index, row in refinery.iterrows()])
refinery["land_scaled_deaths"] = pd.DataFrame([row["deaths"] / row["facility_count"] for index, row in refinery.iterrows()])
refinery["land_scaled_injuries"] = pd.DataFrame([row["injuries"] / row["facility_count"] for index, row in refinery.iterrows()])
refinery

scaler = MinMaxScaler() 
# NOTE(review): iloc columns 7..11 pick the earlier scaled_* columns, not the
# land_scaled_* ones added above, if this cell runs after the first In[26].
refinery_scaled = pd.concat([refinery.iloc[:,[0, 1, 2, 3, 4, 5, 6]], 
                                pd.DataFrame(scaler.fit_transform(refinery.iloc[:,[7, 8, 9, 10, 11]]), 
             columns=["scaled_accidents", "scaled_damage", "scaled_evacuated", "scaled_deaths", "scaled_injuries"])], axis=1)

refineryGeo_scaled = all_states.join(refinery_scaled.set_index("State"), how="right", on="region")
refineryGeo_scaled.drop(columns="subregion", inplace=True)
refineryGeo_scaled.head()
Out[26]:
long lat group order region facility_count accidents deaths injuries evacuated property_damage scaled_accidents scaled_damage scaled_evacuated scaled_deaths scaled_injuries
1 -87.462006 30.389681 1.0 1 alabama 196 31 0 23 59 182682 0.158163 0.000785 0.003989 0.0 0.007228
2 -87.484932 30.372492 1.0 2 alabama 196 31 0 23 59 182682 0.158163 0.000785 0.003989 0.0 0.007228
3 -87.525032 30.372492 1.0 3 alabama 196 31 0 23 59 182682 0.158163 0.000785 0.003989 0.0 0.007228
4 -87.530762 30.332386 1.0 4 alabama 196 31 0 23 59 182682 0.158163 0.000785 0.003989 0.0 0.007228
5 -87.570869 30.326654 1.0 5 alabama 196 31 0 23 59 182682 0.158163 0.000785 0.003989 0.0 0.007228